

library(haven)
library(stringr)
library(dplyr)
library(tidyr)
library(quanteda)
library(purrr)
library(tidyverse)
library(jsonlite)
library(quanteda.textstats)

# NOTE(review): rm(list = ls()) only removes objects from the workspace; it
# does not detach packages or reset options, so it is not a substitute for
# starting a fresh R session. Kept so the script's side effects are unchanged.
rm(list = ls())

# NOTE(review): every relative path below resolves against this directory.
# Hard-coding it makes the script machine-specific; consider running the
# script from the project root instead.
setwd("~/content of updates")

# Folder that is scanned for JSON files.
path <- "merged_feeds"

# `pattern` is a regular expression, not a shell glob: match a literal
# ".json" extension at the very end of the file name.
files <- dir(path, pattern = "\\.json$")

# Fail early with a clear message instead of an obscure error further down.
if (length(files) == 0L) {
  stop("No .json files found in '", path, "'.", call. = FALSE)
}

# Read every file (flattening nested data frames into plain columns) and
# stack the per-file results row-wise into a single data frame.
data <- files %>%
  map(function(f) fromJSON(file.path(path, f), flatten = TRUE)) %>%
  bind_rows()

# Keep only the part of `Published` before the first "T"
# (presumably ISO 8601 timestamps such as 2020-01-31T12:00:00 -- confirm).
data$date <- sub("T.*", "", data$Published)

# Parsed Date version of `date`. Called through the namespace because
# library(tidyverse) attaches lubridate only from tidyverse 2.0.0 onwards.
data$date2 <- lubridate::ymd(data$date)

# Drop everything from "posts" onwards, keeping the leading part of the link.
data$url <- sub("posts.*", "", data$Link)

# concatenate title and text
data$text <- paste(data$Title, data$Content, sep = "-")

# remove duplicated content: keep the first row for each
# (date, text, url) combination together with all other columns
data <- data %>%
  distinct(date, text, url, .keep_all = TRUE)


# create DFM
# Tokenise the raw text, keeping punctuation, numbers, symbols and URLs,
# then lower-case the tokens. The intermediate objects get their own names
# so they do not mask quanteda's tokens() and dfm() functions.
updates_tokens <- tokens(
  data$text,
  remove_punct = FALSE,
  remove_numbers = FALSE,
  remove_symbols = FALSE,
  remove_url = FALSE
)
updates_tokens <- tokens_tolower(updates_tokens)
updates_dfm <- dfm(updates_tokens)

# Lexical diversity (type-token ratio), computed from the DFM.
updates_lexdiv <- textstat_lexdiv(updates_dfm, measure = "TTR")

# Readability (Flesch), computed from the untokenised text.
updates_read <- textstat_readability(data$text, measure = "Flesch")

# Number of whitespace-separated chunks per document. lengths() always
# returns an integer vector, even for zero documents, unlike sapply().
updates_words <- lengths(strsplit(data$text, "\\s+"))

# Assemble the export table column by column. Building it with data.frame()
# rather than cbind() keeps each column's own type: cbind() on mixed
# character/numeric vectors produces a character matrix, which would write
# the three measures to Stata as string variables.
export <- data.frame(
  date = data$date,
  url = data$url,
  updates_lexdiv = updates_lexdiv$TTR,
  updates_read = updates_read$Flesch,
  updates_words = updates_words,
  stringsAsFactors = FALSE
)

# Write the table as a Stata 14 file into the current working directory.
write_dta(export, "content of updates.dta", version = 14)









